import sys
from time import perf_counter
import pandas as pd
import numpy as np
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import KFold, cross_val_score, HalvingGridSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import keras_tuner
import tensorflow as tf
from keras.models import Model
from keras.layers import *
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
# plotly settings: default figure size applied to every px figure below
px.defaults.width = 800
px.defaults.height = 450
# competition metric: root-mean-squared error
def rmse(y, y_pred):
    """Return the RMSE between true values *y* and predictions *y_pred*."""
    squared_errors = (np.asarray(y) - np.asarray(y_pred)) ** 2
    return np.sqrt(np.mean(squared_errors))
# small plotting helper
def plotly_scatter(df, f1, f2):
    """Show a scatter plot of df[f2] against df[f1], with axis labels and a title."""
    scatter = px.scatter(x=df[f1], y=df[f2])
    scatter.update_layout(xaxis_title=f1, yaxis_title=f2, title=f'{f2} vs {f1}')
    scatter.show()
# load the data (plain string literals: the f-prefix was unnecessary, no placeholders)
train = pd.read_parquet('./data/train.parquet')
test = pd.read_parquet('./data/test.parquet')
# inspect the first rows
train.head(20)
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | None | Reg | Lvl | AllPub | ... | 0 | None | None | None | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | None | Reg | Lvl | AllPub | ... | 0 | None | None | None | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | None | IR1 | Lvl | AllPub | ... | 0 | None | None | None | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | None | IR1 | Lvl | AllPub | ... | 0 | None | None | None | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | None | IR1 | Lvl | AllPub | ... | 0 | None | None | None | 0 | 12 | 2008 | WD | Normal | 250000 |
| 5 | 6 | 50 | RL | 85.0 | 14115 | Pave | None | IR1 | Lvl | AllPub | ... | 0 | None | MnPrv | Shed | 700 | 10 | 2009 | WD | Normal | 143000 |
| 6 | 7 | 20 | RL | 75.0 | 10084 | Pave | None | Reg | Lvl | AllPub | ... | 0 | None | None | None | 0 | 8 | 2007 | WD | Normal | 307000 |
| 7 | 8 | 60 | RL | NaN | 10382 | Pave | None | IR1 | Lvl | AllPub | ... | 0 | None | None | Shed | 350 | 11 | 2009 | WD | Normal | 200000 |
| 8 | 9 | 50 | RM | 51.0 | 6120 | Pave | None | Reg | Lvl | AllPub | ... | 0 | None | None | None | 0 | 4 | 2008 | WD | Abnorml | 129900 |
| 9 | 10 | 190 | RL | 50.0 | 7420 | Pave | None | Reg | Lvl | AllPub | ... | 0 | None | None | None | 0 | 1 | 2008 | WD | Normal | 118000 |
| 10 | 11 | 20 | RL | 70.0 | 11200 | Pave | None | Reg | Lvl | AllPub | ... | 0 | None | None | None | 0 | 2 | 2008 | WD | Normal | 129500 |
| 11 | 12 | 60 | RL | 85.0 | 11924 | Pave | None | IR1 | Lvl | AllPub | ... | 0 | None | None | None | 0 | 7 | 2006 | New | Partial | 345000 |
| 12 | 13 | 20 | RL | NaN | 12968 | Pave | None | IR2 | Lvl | AllPub | ... | 0 | None | None | None | 0 | 9 | 2008 | WD | Normal | 144000 |
| 13 | 14 | 20 | RL | 91.0 | 10652 | Pave | None | IR1 | Lvl | AllPub | ... | 0 | None | None | None | 0 | 8 | 2007 | New | Partial | 279500 |
| 14 | 15 | 20 | RL | NaN | 10920 | Pave | None | IR1 | Lvl | AllPub | ... | 0 | None | GdWo | None | 0 | 5 | 2008 | WD | Normal | 157000 |
| 15 | 16 | 45 | RM | 51.0 | 6120 | Pave | None | Reg | Lvl | AllPub | ... | 0 | None | GdPrv | None | 0 | 7 | 2007 | WD | Normal | 132000 |
| 16 | 17 | 20 | RL | NaN | 11241 | Pave | None | IR1 | Lvl | AllPub | ... | 0 | None | None | Shed | 700 | 3 | 2010 | WD | Normal | 149000 |
| 17 | 18 | 90 | RL | 72.0 | 10791 | Pave | None | Reg | Lvl | AllPub | ... | 0 | None | None | Shed | 500 | 10 | 2006 | WD | Normal | 90000 |
| 18 | 19 | 20 | RL | 66.0 | 13695 | Pave | None | Reg | Lvl | AllPub | ... | 0 | None | None | None | 0 | 6 | 2008 | WD | Normal | 159000 |
| 19 | 20 | 20 | RL | 70.0 | 7560 | Pave | None | Reg | Lvl | AllPub | ... | 0 | None | MnPrv | None | 0 | 5 | 2009 | COD | Abnorml | 139000 |
20 rows × 81 columns
# check dtypes and missing-value counts .. we can see a lot of non numeric (object) features
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1460 entries, 0 to 1459 Data columns (total 81 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 1460 non-null int64 1 MSSubClass 1460 non-null int64 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 LotArea 1460 non-null int64 5 Street 1460 non-null object 6 Alley 91 non-null object 7 LotShape 1460 non-null object 8 LandContour 1460 non-null object 9 Utilities 1460 non-null object 10 LotConfig 1460 non-null object 11 LandSlope 1460 non-null object 12 Neighborhood 1460 non-null object 13 Condition1 1460 non-null object 14 Condition2 1460 non-null object 15 BldgType 1460 non-null object 16 HouseStyle 1460 non-null object 17 OverallQual 1460 non-null int64 18 OverallCond 1460 non-null int64 19 YearBuilt 1460 non-null int64 20 YearRemodAdd 1460 non-null int64 21 RoofStyle 1460 non-null object 22 RoofMatl 1460 non-null object 23 Exterior1st 1460 non-null object 24 Exterior2nd 1460 non-null object 25 MasVnrType 1452 non-null object 26 MasVnrArea 1452 non-null float64 27 ExterQual 1460 non-null object 28 ExterCond 1460 non-null object 29 Foundation 1460 non-null object 30 BsmtQual 1423 non-null object 31 BsmtCond 1423 non-null object 32 BsmtExposure 1422 non-null object 33 BsmtFinType1 1423 non-null object 34 BsmtFinSF1 1460 non-null int64 35 BsmtFinType2 1422 non-null object 36 BsmtFinSF2 1460 non-null int64 37 BsmtUnfSF 1460 non-null int64 38 TotalBsmtSF 1460 non-null int64 39 Heating 1460 non-null object 40 HeatingQC 1460 non-null object 41 CentralAir 1460 non-null object 42 Electrical 1459 non-null object 43 1stFlrSF 1460 non-null int64 44 2ndFlrSF 1460 non-null int64 45 LowQualFinSF 1460 non-null int64 46 GrLivArea 1460 non-null int64 47 BsmtFullBath 1460 non-null int64 48 BsmtHalfBath 1460 non-null int64 49 FullBath 1460 non-null int64 50 HalfBath 1460 non-null int64 51 BedroomAbvGr 1460 non-null int64 52 KitchenAbvGr 1460 non-null int64 53 KitchenQual 1460 non-null 
object 54 TotRmsAbvGrd 1460 non-null int64 55 Functional 1460 non-null object 56 Fireplaces 1460 non-null int64 57 FireplaceQu 770 non-null object 58 GarageType 1379 non-null object 59 GarageYrBlt 1379 non-null float64 60 GarageFinish 1379 non-null object 61 GarageCars 1460 non-null int64 62 GarageArea 1460 non-null int64 63 GarageQual 1379 non-null object 64 GarageCond 1379 non-null object 65 PavedDrive 1460 non-null object 66 WoodDeckSF 1460 non-null int64 67 OpenPorchSF 1460 non-null int64 68 EnclosedPorch 1460 non-null int64 69 3SsnPorch 1460 non-null int64 70 ScreenPorch 1460 non-null int64 71 PoolArea 1460 non-null int64 72 PoolQC 7 non-null object 73 Fence 281 non-null object 74 MiscFeature 54 non-null object 75 MiscVal 1460 non-null int64 76 MoSold 1460 non-null int64 77 YrSold 1460 non-null int64 78 SaleType 1460 non-null object 79 SaleCondition 1460 non-null object 80 SalePrice 1460 non-null int64 dtypes: float64(3), int64(35), object(43) memory usage: 924.0+ KB
# the Id column carries no signal -> drop it, but keep the test ids for the submission file
test_IDs = test["Id"]
train = train.drop(columns="Id")
test = test.drop(columns="Id")
# check linear correlation between the numeric features
# numeric_only=True silences the pandas FutureWarning and matches the future default
corr = train.corr(numeric_only=True)
plt.subplots(figsize=(15, 12))
sns.heatmap(corr, vmax=0.9, cmap="Blues", square=True)
C:\Users\gyenist\AppData\Local\Temp\ipykernel_6324\3622780105.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. corr = train.corr()
<AxesSubplot: >
# check the 20 features most correlated with the target (index 0 is SalePrice itself)
# numeric_only=True silences the pandas FutureWarning and matches the future default
corrs = train.corr(numeric_only=True)['SalePrice'].sort_values(ascending=False)[1:21]
px.bar(corrs).update_layout(
    xaxis_title='feature', yaxis_title='correlation', title='Correlations')
C:\Users\gyenist\AppData\Local\Temp\ipykernel_6324\2144235933.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. corrs = train.corr()['SalePrice'].sort_values(ascending=False)[1:21]
# inspect outliers for the five features most correlated with the target
for feature in corrs.index[:5]:
    plotly_scatter(train, feature, 'SalePrice')
# remove the outliers identified on the scatter plots above
outlier_conditions = (
    lambda df: (df['OverallQual'] < 5) & (df['SalePrice'] > 200000),
    lambda df: (df['GrLivArea'] > 4500) & (df['SalePrice'] < 300000),
    lambda df: df['TotalBsmtSF'] > 4500,
    lambda df: df['GarageArea'] > 1220,
)
for condition in outlier_conditions:
    train = train[~condition(train)]
train = train.reset_index(drop=True)
# look at the raw target distribution
dist_fig = ff.create_distplot([train["SalePrice"]], group_labels=['distplot'], bin_size=10000)
dist_fig.update_layout(title_text='Curve and Rug Plot')
dist_fig.show()
We can see that the SalePrice targets have a slight positive skew. We can correct this with a log transform.
# log1p-transform the target to correct the positive skew, then re-plot the distribution
train["SalePrice"] = np.log1p(train["SalePrice"])
dist_fig = ff.create_distplot([train["SalePrice"]], group_labels=['distplot'], bin_size=.05)
dist_fig.update_layout(title_text='Curve and Rug Plot')
dist_fig.show()
# split off the targets, then stack train + test features so they are cleaned together
y_train = train['SalePrice'].reset_index(drop=True)
train_features = train.drop(columns=['SalePrice'])
test_features = test
all_features = pd.concat([train_features, test_features]).reset_index(drop=True)
all_features.shape
(2913, 79)
# these numeric codes are categories, not quantities -> store them as strings
for cat_col in ('MSSubClass', 'YrSold', 'MoSold'):
    all_features[cat_col] = all_features[cat_col].astype(str)
# the data description states that NA refers to typical ('Typ') values
all_features['Functional'] = all_features['Functional'].fillna('Typ')
# one-off gaps in these two columns get a plausible default value
all_features['Electrical'] = all_features['Electrical'].fillna("SBrkr")
all_features['KitchenQual'] = all_features['KitchenQual'].fillna("TA")
# the data description states that NA refers to "no pool"
all_features["PoolQC"] = all_features["PoolQC"].fillna("None")
# the description says NaN means no alley access
all_features["Alley"] = all_features["Alley"].fillna("None")
# a missing garage size simply means there is no garage -> 0
for garage_col in ('GarageArea', 'GarageCars'):
    all_features[garage_col] = all_features[garage_col].fillna(0)
# NaN for the categorical basement features means there is no basement
for bsmt_col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_features[bsmt_col] = all_features[bsmt_col].fillna('None')
# LotFrontage depends heavily on location -> impute with the neighborhood median
all_features['LotFrontage'] = all_features.groupby('Neighborhood')['LotFrontage'].transform(
    lambda lf: lf.fillna(lf.median()))
# fill whatever is left, which we can't interpret: 'None' for categoricals, 0 for numerics
# (select_dtypes replaces the original manual dtype loops — same columns, less code)
objects = list(all_features.select_dtypes(include='object').columns)
all_features.update(all_features[objects].fillna('None'))
numeric = list(all_features.select_dtypes(include='number').columns)
all_features.update(all_features[numeric].fillna(0))
all_features.describe()
| LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | ... | GarageYrBlt | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | ... | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 |
| mean | 69.403021 | 10113.405424 | 6.088225 | 5.566083 | 1971.297288 | 1984.251287 | 100.913835 | 438.686577 | 49.667353 | 560.149331 | ... | 1870.115345 | 1.763817 | 471.343289 | 93.530381 | 47.260213 | 23.145898 | 2.607621 | 16.095434 | 2.091658 | 50.738414 |
| std | 21.193771 | 7758.911341 | 1.404996 | 1.113345 | 30.290390 | 20.890446 | 178.092327 | 443.958903 | 169.338330 | 438.974982 | ... | 450.114935 | 0.760798 | 213.120051 | 126.410731 | 67.132175 | 64.301832 | 25.213828 | 56.237482 | 34.585013 | 567.904167 |
| min | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 60.000000 | 7472.000000 | 5.000000 | 5.000000 | 1953.000000 | 1965.000000 | 0.000000 | 0.000000 | 0.000000 | 220.000000 | ... | 1957.000000 | 1.000000 | 320.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 70.000000 | 9450.000000 | 6.000000 | 5.000000 | 1973.000000 | 1993.000000 | 0.000000 | 368.000000 | 0.000000 | 467.000000 | ... | 1977.000000 | 2.000000 | 479.000000 | 0.000000 | 26.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 80.000000 | 11526.000000 | 7.000000 | 6.000000 | 2001.000000 | 2004.000000 | 163.000000 | 732.000000 | 0.000000 | 803.000000 | ... | 2001.000000 | 2.000000 | 576.000000 | 168.000000 | 70.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| max | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 4010.000000 | 1526.000000 | 2336.000000 | ... | 2207.000000 | 5.000000 | 1488.000000 | 1424.000000 | 742.000000 | 1012.000000 | 508.000000 | 576.000000 | 800.000000 | 17000.000000 |
8 rows × 33 columns
# drop features with very low standard deviation -> they contain very little information
all_features = all_features.drop(['Utilities', 'Street', 'PoolQC'], axis=1)
# add some engineered features
all_features['BsmtFinType1_Unf'] = 1*(all_features['BsmtFinType1'] == 'Unf')
# binary presence flags. FIX: the original used `== 0`, so e.g. HasWoodDeck was 1 when
# the house had NO wood deck — inverted relative to its name. Flipped to `> 0` so the
# flag matches its name (the information content for downstream models is unchanged).
all_features['HasWoodDeck'] = (all_features['WoodDeckSF'] > 0) * 1
all_features['HasOpenPorch'] = (all_features['OpenPorchSF'] > 0) * 1
all_features['HasEnclosedPorch'] = (all_features['EnclosedPorch'] > 0) * 1
all_features['Has3SsnPorch'] = (all_features['3SsnPorch'] > 0) * 1
all_features['HasScreenPorch'] = (all_features['ScreenPorch'] > 0) * 1
# YrSold/MoSold were cast to str above, so cast back for the arithmetic
all_features['YearsSinceRemodel'] = all_features['YrSold'].astype(int) - all_features['YearRemodAdd'].astype(int)
all_features['Total_Home_Quality'] = all_features['OverallQual'] + all_features['OverallCond']
all_features['TotalSF'] = all_features['TotalBsmtSF'] + all_features['1stFlrSF'] + all_features['2ndFlrSF']
all_features['YrBltAndRemod'] = all_features['YearBuilt'] + all_features['YearRemodAdd']
all_features['Total_sqr_footage'] = (all_features['BsmtFinSF1'] + all_features['BsmtFinSF2'] +
                                     all_features['1stFlrSF'] + all_features['2ndFlrSF'])
# half baths count as 0.5
all_features['Total_Bathrooms'] = (all_features['FullBath'] + (0.5 * all_features['HalfBath']) +
                                   all_features['BsmtFullBath'] + (0.5 * all_features['BsmtHalfBath']))
all_features['Total_porch_sf'] = (all_features['OpenPorchSF'] + all_features['3SsnPorch'] +
                                  all_features['EnclosedPorch'] + all_features['ScreenPorch'] +
                                  all_features['WoodDeckSF'])
# more presence flags, derived from areas/counts
for flag_col, source_col in (('haspool', 'PoolArea'), ('has2ndfloor', '2ndFlrSF'),
                             ('hasgarage', 'GarageArea'), ('hasbsmt', 'TotalBsmtSF'),
                             ('hasfireplace', 'Fireplaces')):
    all_features[flag_col] = (all_features[source_col] > 0).astype(int)
all_features.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2913 entries, 0 to 2912 Data columns (total 94 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MSSubClass 2913 non-null object 1 MSZoning 2913 non-null object 2 LotFrontage 2913 non-null float64 3 LotArea 2913 non-null int64 4 Alley 2913 non-null object 5 LotShape 2913 non-null object 6 LandContour 2913 non-null object 7 LotConfig 2913 non-null object 8 LandSlope 2913 non-null object 9 Neighborhood 2913 non-null object 10 Condition1 2913 non-null object 11 Condition2 2913 non-null object 12 BldgType 2913 non-null object 13 HouseStyle 2913 non-null object 14 OverallQual 2913 non-null int64 15 OverallCond 2913 non-null int64 16 YearBuilt 2913 non-null int64 17 YearRemodAdd 2913 non-null int64 18 RoofStyle 2913 non-null object 19 RoofMatl 2913 non-null object 20 Exterior1st 2913 non-null object 21 Exterior2nd 2913 non-null object 22 MasVnrType 2913 non-null object 23 MasVnrArea 2913 non-null float64 24 ExterQual 2913 non-null object 25 ExterCond 2913 non-null object 26 Foundation 2913 non-null object 27 BsmtQual 2913 non-null object 28 BsmtCond 2913 non-null object 29 BsmtExposure 2913 non-null object 30 BsmtFinType1 2913 non-null object 31 BsmtFinSF1 2913 non-null float64 32 BsmtFinType2 2913 non-null object 33 BsmtFinSF2 2913 non-null float64 34 BsmtUnfSF 2913 non-null float64 35 TotalBsmtSF 2913 non-null float64 36 Heating 2913 non-null object 37 HeatingQC 2913 non-null object 38 CentralAir 2913 non-null object 39 Electrical 2913 non-null object 40 1stFlrSF 2913 non-null int64 41 2ndFlrSF 2913 non-null int64 42 LowQualFinSF 2913 non-null int64 43 GrLivArea 2913 non-null int64 44 BsmtFullBath 2913 non-null float64 45 BsmtHalfBath 2913 non-null float64 46 FullBath 2913 non-null int64 47 HalfBath 2913 non-null int64 48 BedroomAbvGr 2913 non-null int64 49 KitchenAbvGr 2913 non-null int64 50 KitchenQual 2913 non-null object 51 TotRmsAbvGrd 2913 non-null int64 52 Functional 2913 non-null object 
53 Fireplaces 2913 non-null int64 54 FireplaceQu 2913 non-null object 55 GarageType 2913 non-null object 56 GarageYrBlt 2913 non-null float64 57 GarageFinish 2913 non-null object 58 GarageCars 2913 non-null float64 59 GarageArea 2913 non-null float64 60 GarageQual 2913 non-null object 61 GarageCond 2913 non-null object 62 PavedDrive 2913 non-null object 63 WoodDeckSF 2913 non-null int64 64 OpenPorchSF 2913 non-null int64 65 EnclosedPorch 2913 non-null int64 66 3SsnPorch 2913 non-null int64 67 ScreenPorch 2913 non-null int64 68 PoolArea 2913 non-null int64 69 Fence 2913 non-null object 70 MiscFeature 2913 non-null object 71 MiscVal 2913 non-null int64 72 MoSold 2913 non-null object 73 YrSold 2913 non-null object 74 SaleType 2913 non-null object 75 SaleCondition 2913 non-null object 76 BsmtFinType1_Unf 2913 non-null int32 77 HasWoodDeck 2913 non-null int32 78 HasOpenPorch 2913 non-null int32 79 HasEnclosedPorch 2913 non-null int32 80 Has3SsnPorch 2913 non-null int32 81 HasScreenPorch 2913 non-null int32 82 YearsSinceRemodel 2913 non-null int32 83 Total_Home_Quality 2913 non-null int64 84 TotalSF 2913 non-null float64 85 YrBltAndRemod 2913 non-null int64 86 Total_sqr_footage 2913 non-null float64 87 Total_Bathrooms 2913 non-null float64 88 Total_porch_sf 2913 non-null int64 89 haspool 2913 non-null int64 90 has2ndfloor 2913 non-null int64 91 hasgarage 2913 non-null int64 92 hasbsmt 2913 non-null int64 93 hasfireplace 2913 non-null int64 dtypes: float64(14), int32(7), int64(30), object(43) memory usage: 2.0+ MB
# descriptors for ordinal features: mapping these to integers keeps the feature count
# lower than one-hot encoding every categorical feature
quality_scale = {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}
desc = {
    'Alley': {'None': 0, 'Grvl': 1, 'Pave': 2},
    'Utilities': {'ELO': 0, 'NoSeWa': 1, 'NoSewr': 2, 'AllPub': 3},
    'ExterQual': dict(quality_scale),
    'ExterCond': dict(quality_scale),
    'KitchenQual': dict(quality_scale),
}
# convert the ordinal features listed above (skip any dropped earlier, e.g. Utilities)
for ord_col, mapping in desc.items():
    if ord_col in all_features.columns:
        all_features[ord_col] = all_features[ord_col].map(mapping)
# one-hot encode the remaining (non-ordinal) categorical features
all_features = pd.get_dummies(all_features).reset_index(drop=True)
all_features.describe()
| LotFrontage | LotArea | Alley | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | ExterQual | ExterCond | ... | SaleType_New | SaleType_None | SaleType_Oth | SaleType_WD | SaleCondition_Abnorml | SaleCondition_AdjLand | SaleCondition_Alloca | SaleCondition_Family | SaleCondition_Normal | SaleCondition_Partial | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | ... | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 | 2913.000000 |
| mean | 69.403021 | 10113.405424 | 0.094748 | 6.088225 | 5.566083 | 1971.297288 | 1984.251287 | 100.913835 | 2.395812 | 2.085479 | ... | 0.081016 | 0.000343 | 0.002403 | 0.866117 | 0.065225 | 0.004119 | 0.008239 | 0.015791 | 0.823550 | 0.083076 |
| std | 21.193771 | 7758.911341 | 0.373325 | 1.404996 | 1.113345 | 30.290390 | 20.890446 | 178.092327 | 0.577934 | 0.372342 | ... | 0.272907 | 0.018528 | 0.048970 | 0.340585 | 0.246965 | 0.064062 | 0.090409 | 0.124689 | 0.381268 | 0.276044 |
| min | 21.000000 | 1300.000000 | 0.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 1.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 60.000000 | 7472.000000 | 0.000000 | 5.000000 | 5.000000 | 1953.000000 | 1965.000000 | 0.000000 | 2.000000 | 2.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
| 50% | 70.000000 | 9450.000000 | 0.000000 | 6.000000 | 5.000000 | 1973.000000 | 1993.000000 | 0.000000 | 2.000000 | 2.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
| 75% | 80.000000 | 11526.000000 | 0.000000 | 7.000000 | 6.000000 | 2001.000000 | 2004.000000 | 163.000000 | 3.000000 | 2.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
| max | 313.000000 | 215245.000000 | 2.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 4.000000 | 4.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 333 columns
# display the fully processed feature table
all_features
| LotFrontage | LotArea | Alley | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | ExterQual | ExterCond | ... | SaleType_New | SaleType_None | SaleType_Oth | SaleType_WD | SaleCondition_Abnorml | SaleCondition_AdjLand | SaleCondition_Alloca | SaleCondition_Family | SaleCondition_Normal | SaleCondition_Partial | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 65.0 | 8450 | 0 | 7 | 5 | 2003 | 2003 | 196.0 | 3 | 2 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 80.0 | 9600 | 0 | 6 | 8 | 1976 | 1976 | 0.0 | 2 | 2 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 68.0 | 11250 | 0 | 7 | 5 | 2001 | 2002 | 162.0 | 3 | 2 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3 | 60.0 | 9550 | 0 | 7 | 5 | 1915 | 1970 | 0.0 | 2 | 2 | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
| 4 | 84.0 | 14260 | 0 | 8 | 5 | 2000 | 2000 | 350.0 | 3 | 2 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2908 | 21.0 | 1936 | 0 | 4 | 7 | 1970 | 1970 | 0.0 | 2 | 2 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2909 | 21.0 | 1894 | 0 | 4 | 5 | 1970 | 1970 | 0.0 | 2 | 2 | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
| 2910 | 160.0 | 20000 | 0 | 5 | 7 | 1960 | 1996 | 0.0 | 2 | 2 | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
| 2911 | 62.0 | 10441 | 0 | 5 | 5 | 1992 | 1992 | 0.0 | 2 | 2 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2912 | 74.0 | 9627 | 0 | 7 | 5 | 1993 | 1994 | 94.0 | 2 | 2 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2913 rows × 333 columns
# split the combined feature table back into train and test parts
n_train = len(y_train)
X_train = all_features.iloc[:n_train, :]
X_test = all_features.iloc[n_train:, :]
X_train.shape, y_train.shape, X_test.shape
((1454, 333), (1454,), (1459, 333))
We train a single regression tree and use grid search with 10-fold cross-validation for hyperparameter tuning. The grid search is commented out so the notebook can be run without waiting for it to complete.
# # set the parameter grid
# parameters={
# "max_features": list(np.arange(100, 320, 2)) + [len(all_features.columns)],
# "max_leaf_nodes": list(np.arange(50, 100, 2))
# }
# # get an estimator object
# estimator = DecisionTreeRegressor()
#
# # run optimizer
# t_start = perf_counter()
# sh_dfl = GridSearchCV(estimator, parameters, cv=10, verbose=1).fit(X_train, y_train)
# t_stop = perf_counter()
#
# # results
# print(f'Took {(t_stop-t_start)/60:.2f} mins to optimize\nBest params:')
# [print(f'{k}: {v}') for k, v in sh_dfl.best_params_.items()]
# Example output. I ran this a few times with slightly different settings.
# Fitting 10 folds for each of 2775 candidates, totalling 27750 fits
# Took 13.74 mins to optimize
# Best params:
# max_features: 236
# max_leaf_nodes: 54
# dt = DecisionTreeRegressor(**sh_dfl.best_params_)
# best parameters found by the grid search above, hard-coded so the notebook runs without re-searching
dt = DecisionTreeRegressor(max_features=236, max_leaf_nodes=54)
# 25-fold cross-validation with the competition metric (RMSE = sqrt of per-fold MSE)
kf = KFold(n_splits=25, random_state=999, shuffle=True)
MSE_scorer = make_scorer(mean_squared_error)
fold_rmse = np.sqrt(cross_val_score(dt, X_train, y_train, scoring=MSE_scorer, cv=kf))
print(f'single tree scored {np.mean(fold_rmse):.6f} with std of {np.std(fold_rmse):.6f}')
single tree scored 0.180911 with std of 0.029965
# refit the tuned tree on the complete training set
dt_full = dt.fit(X_train, y_train)
# inspect the size of the resulting tree
dt_full.get_depth(), dt_full.get_n_leaves()
(8, 54)
# score on the training data (same metric as the CV above)
score = np.sqrt(MSE_scorer(dt_full, X_train, y_train))
# FIX: the value is an RMSE (sqrt applied), the original message mislabelled it as MSE
print(f'RMSE score on train data: {score}')
MSE score on train data: 0.12404532522357475
# predict on the test set and invert the log1p target transform
preds_test = np.expm1(dt_full.predict(X_test))
preds_final = pd.DataFrame({"Id": test_IDs, "SalePrice": preds_test})
preds_final
| Id | SalePrice | |
|---|---|---|
| 0 | 1461 | 124581.694917 |
| 1 | 1462 | 137042.410372 |
| 2 | 1463 | 149121.875345 |
| 3 | 1464 | 197911.415304 |
| 4 | 1465 | 199884.603043 |
| ... | ... | ... |
| 1454 | 2915 | 81539.297943 |
| 1455 | 2916 | 103087.648835 |
| 1456 | 2917 | 149121.875345 |
| 1457 | 2918 | 99079.167550 |
| 1458 | 2919 | 214936.395618 |
1459 rows × 2 columns
# write the submission file; index=False is the documented way to drop the index (was index=None)
preds_final.to_csv("./data/preds_with_hpo.csv", index=False)
For the freestyle mode we use two classical ensemble models and a dense neural network. Hyperparameter search for these can be very computationally expensive, so it is only used to find a base neural network architecture. We just use the mean of the models' predictions for the final ensemble submission.
# gradient boosting with default parameters, evaluated with the same 25-fold CV (RMSE)
gb = GradientBoostingRegressor()
kf = KFold(n_splits=25, random_state=999, shuffle=True)
MSE_scorer = make_scorer(mean_squared_error)
gb_fold_rmse = np.sqrt(cross_val_score(gb, X_train, y_train, scoring=MSE_scorer, cv=kf))
print(f'gb scored {np.mean(gb_fold_rmse):.6f} with std of {np.std(gb_fold_rmse):.6f}')
gb scored 0.118587 with std of 0.020774
# fit GB on the full data and build its submission frame (log1p inverted)
gb_full = gb.fit(X_train, y_train)
preds_test_gb = np.expm1(gb_full.predict(X_test))
preds_final_gb = pd.DataFrame({"Id": test_IDs, "SalePrice": preds_test_gb})
# same 25-fold CV evaluation for a default random forest
rf = RandomForestRegressor()
kf = KFold(n_splits=25, random_state=999, shuffle=True)
MSE_scorer = make_scorer(mean_squared_error)
rf_fold_rmse = np.sqrt(cross_val_score(rf, X_train, y_train, scoring=MSE_scorer, cv=kf))
print(f'rf scored {np.mean(rf_fold_rmse):.6f} with std of {np.std(rf_fold_rmse):.6f}')
rf scored 0.130527 with std of 0.023893
# fit RF on the full data and build its submission frame
rf_full = rf.fit(X_train, y_train)
preds_test_rf = np.expm1(rf_full.predict(X_test))
# BUG FIX: the original passed preds_test_gb here, so the "RF" frame silently held the GB predictions
preds_final_rf = pd.DataFrame.from_dict({"Id": test_IDs, "SalePrice": preds_test_rf})
# copy the model-ready data and append the targets so they are normalized too
df_norm = X_train.copy()
df_norm['HousePrice'] = y_train
# shuffle, split, then normalize
split = .8
split_id = round(split * len(df_norm))
df_norm = df_norm.sample(frac=1, random_state=333).reset_index(drop=True)
scaler = StandardScaler()
# FIX: fit the scaler on the training split only. The original called fit_transform on the
# full frame before splitting, leaking the held-out rows' mean/std into the normalization.
df_train = scaler.fit_transform(df_norm.iloc[:split_id, :])
df_test = scaler.transform(df_norm.iloc[split_id:, :])
# separate features from the (scaled) target, which is the last column (HousePrice)
x_train = df_train[:, :-1].astype('float32')
y_train = df_train[:, -1].astype('float32')
x_eval = df_test[:, :-1].astype('float32')
y_eval = df_test[:, -1].astype('float32')
Searching for a good base architecture using keras_tuner.Hyperband -> this can be much faster than Bayesian optimization or grid search. The code is commented out so the notebook can be run without waiting for the search.
def model_builder(hp):
    """Build a compiled Keras model from a keras_tuner HyperParameters object.

    The search space covers network depth (1..3), the activation function,
    and per-layer width and dropout rate.
    """
    max_depth = 3
    widths = []
    drops = []
    # number of hidden Dense layers actually instantiated
    depth = hp.Int(f'depth', min_value=1, max_value=max_depth, step=1)
    act = hp.Choice('act', values=['swish', 'relu'])
    # register width/dropout hyperparameters for every possible layer slot
    # (even slots beyond `depth`) so the search space stays fixed across trials
    for i in range(max_depth):
        widths.append(hp.Int(f'dense_width_{i}', min_value=16, max_value=2048, step=16))
        drops.append(hp.Float(f'dropout_val_{i}', min_value=.1, max_value=.7))
    # input width = feature count of df_train minus the target column
    # NOTE(review): shape=(n) is an int, not a tuple; tf.keras accepts it, but (n,) would be safer
    inp = Input(shape=(df_train.shape[-1]-1))
    for i in range(depth):
        if i == 0:
            x = Dense(units=round(widths[i]), activation=act)(inp)
        else:
            x = Dense(units=round(widths[i]), activation=act)(x)
        x = Dropout(drops[i])(x)
    # single linear output unit for regression
    x = Dense(1, activation='linear')(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(optimizer=Adam(), loss=tf.keras.losses.mean_squared_error)
    return model
# tuner = keras_tuner.Hyperband(
# model_builder,
# max_epochs=100,
# factor=3,
# objective="val_loss",
# directory='./data/DL',
# project_name=f'HS_LOGS',
# overwrite=True
# )
# tuner.search_space_summary()
# tuner.search(x_train, y_train, epochs=100, validation_split=.2, batch_size=128, verbose=1,
# shuffle=True, callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)])
#
# models = tuner.get_best_models(num_models=5)
#
# best_model = models[0]
#
# print(tuner.results_summary())
# print(best_model.summary())
#
# original_stdout = sys.stdout
# with open(f'./data/DL/LOG.log', 'w') as f:
# sys.stdout = f
# print(f'\n***TUNER SUMMARY***\n')
# print(tuner.results_summary())
#
# print(f'\n\n***MODELS SUMMARY***\n')
# for rank, model in enumerate(models):
# print(f'\nMODEL RANK {rank} - STRUCTURE:')
# print(model.summary())
# sys.stdout = original_stdout
# best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
# model_1 = tuner.hypermodel.build(best_hps)
# print(model_1.summary())
The parameters can be seen in the LOG.log file. We can now build a similar model.
# wrapped in a function for reusability
def build():
    """Build and compile the dense regression network chosen from the tuner search."""
    inp = Input(shape=(df_train.shape[-1]-1))
    hidden = Dense(1000, activation='relu')(inp)
    hidden = Dropout(.5)(hidden)
    hidden = Dense(100, activation='relu')(hidden)
    hidden = Dropout(.2)(hidden)
    out = Dense(1, activation='linear')(hidden)
    model = Model(inputs=inp, outputs=out)
    model.compile(optimizer='adam', loss=tf.keras.losses.MSE)
    print(model.summary())
    return model
# checkpoint callback: persist only the weights with the lowest validation loss
callbacks = [
    ModelCheckpoint(filepath=f'./data/DL/model_weights_best.h5',
                    monitor='val_loss',
                    mode='min',
                    save_best_only=True,
                    verbose=1),
]
# build the network and fit it, tracking progress on the held-out eval split
model = build()
hist = model.fit(x=x_train, y=y_train,
                 validation_data=(x_eval, y_eval),
                 batch_size=32,
                 epochs=150,
                 callbacks=callbacks,
                 verbose=0)
Model: "model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, 333)] 0
dense (Dense) (None, 1000) 334000
dropout (Dropout) (None, 1000) 0
dense_1 (Dense) (None, 100) 100100
dropout_1 (Dropout) (None, 100) 0
dense_2 (Dense) (None, 1) 101
=================================================================
Total params: 434,201
Trainable params: 434,201
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1: val_loss improved from inf to 0.41158, saving model to ./data/DL\model_weights_best.h5
Epoch 2: val_loss improved from 0.41158 to 0.28557, saving model to ./data/DL\model_weights_best.h5
Epoch 3: val_loss improved from 0.28557 to 0.25773, saving model to ./data/DL\model_weights_best.h5
Epoch 4: val_loss improved from 0.25773 to 0.19066, saving model to ./data/DL\model_weights_best.h5
Epoch 5: val_loss did not improve from 0.19066
Epoch 6: val_loss did not improve from 0.19066
Epoch 7: val_loss did not improve from 0.19066
Epoch 8: val_loss did not improve from 0.19066
Epoch 9: val_loss improved from 0.19066 to 0.14923, saving model to ./data/DL\model_weights_best.h5
Epoch 10: val_loss did not improve from 0.14923
Epoch 11: val_loss improved from 0.14923 to 0.13262, saving model to ./data/DL\model_weights_best.h5
Epoch 12: val_loss did not improve from 0.13262
Epoch 13: val_loss did not improve from 0.13262
Epoch 14: val_loss improved from 0.13262 to 0.12666, saving model to ./data/DL\model_weights_best.h5
Epoch 15: val_loss did not improve from 0.12666
Epoch 16: val_loss did not improve from 0.12666
Epoch 17: val_loss did not improve from 0.12666
Epoch 18: val_loss did not improve from 0.12666
Epoch 19: val_loss did not improve from 0.12666
Epoch 20: val_loss improved from 0.12666 to 0.11815, saving model to ./data/DL\model_weights_best.h5
Epoch 21: val_loss did not improve from 0.11815
Epoch 22: val_loss did not improve from 0.11815
Epoch 23: val_loss did not improve from 0.11815
Epoch 24: val_loss did not improve from 0.11815
Epoch 25: val_loss did not improve from 0.11815
Epoch 26: val_loss did not improve from 0.11815
Epoch 27: val_loss did not improve from 0.11815
Epoch 28: val_loss did not improve from 0.11815
Epoch 29: val_loss did not improve from 0.11815
Epoch 30: val_loss did not improve from 0.11815
Epoch 31: val_loss improved from 0.11815 to 0.11511, saving model to ./data/DL\model_weights_best.h5
Epoch 32: val_loss did not improve from 0.11511
Epoch 33: val_loss did not improve from 0.11511
Epoch 34: val_loss did not improve from 0.11511
Epoch 35: val_loss did not improve from 0.11511
Epoch 36: val_loss did not improve from 0.11511
Epoch 37: val_loss did not improve from 0.11511
Epoch 38: val_loss improved from 0.11511 to 0.11296, saving model to ./data/DL\model_weights_best.h5
Epoch 39: val_loss did not improve from 0.11296
Epoch 40: val_loss did not improve from 0.11296
Epoch 41: val_loss did not improve from 0.11296
Epoch 42: val_loss did not improve from 0.11296
Epoch 43: val_loss did not improve from 0.11296
Epoch 44: val_loss did not improve from 0.11296
Epoch 45: val_loss did not improve from 0.11296
Epoch 46: val_loss did not improve from 0.11296
Epoch 47: val_loss did not improve from 0.11296
Epoch 48: val_loss did not improve from 0.11296
Epoch 49: val_loss did not improve from 0.11296
Epoch 50: val_loss did not improve from 0.11296
Epoch 51: val_loss did not improve from 0.11296
Epoch 52: val_loss did not improve from 0.11296
Epoch 53: val_loss did not improve from 0.11296
Epoch 54: val_loss did not improve from 0.11296
Epoch 55: val_loss did not improve from 0.11296
Epoch 56: val_loss did not improve from 0.11296
Epoch 57: val_loss did not improve from 0.11296
Epoch 58: val_loss did not improve from 0.11296
Epoch 59: val_loss did not improve from 0.11296
Epoch 60: val_loss did not improve from 0.11296
Epoch 61: val_loss did not improve from 0.11296
Epoch 62: val_loss did not improve from 0.11296
Epoch 63: val_loss did not improve from 0.11296
Epoch 64: val_loss did not improve from 0.11296
Epoch 65: val_loss did not improve from 0.11296
Epoch 66: val_loss did not improve from 0.11296
Epoch 67: val_loss did not improve from 0.11296
Epoch 68: val_loss did not improve from 0.11296
Epoch 69: val_loss did not improve from 0.11296
Epoch 70: val_loss did not improve from 0.11296
Epoch 71: val_loss did not improve from 0.11296
Epoch 72: val_loss did not improve from 0.11296
Epoch 73: val_loss did not improve from 0.11296
Epoch 74: val_loss did not improve from 0.11296
Epoch 75: val_loss did not improve from 0.11296
Epoch 76: val_loss did not improve from 0.11296
Epoch 77: val_loss did not improve from 0.11296
Epoch 78: val_loss did not improve from 0.11296
Epoch 79: val_loss did not improve from 0.11296
Epoch 80: val_loss did not improve from 0.11296
Epoch 81: val_loss did not improve from 0.11296
Epoch 82: val_loss did not improve from 0.11296
Epoch 83: val_loss did not improve from 0.11296
Epoch 84: val_loss did not improve from 0.11296
Epoch 85: val_loss did not improve from 0.11296
Epoch 86: val_loss did not improve from 0.11296
Epoch 87: val_loss did not improve from 0.11296
Epoch 88: val_loss did not improve from 0.11296
Epoch 89: val_loss did not improve from 0.11296
Epoch 90: val_loss did not improve from 0.11296
Epoch 91: val_loss did not improve from 0.11296
Epoch 92: val_loss did not improve from 0.11296
Epoch 93: val_loss did not improve from 0.11296
Epoch 94: val_loss did not improve from 0.11296
Epoch 95: val_loss did not improve from 0.11296
Epoch 96: val_loss did not improve from 0.11296
Epoch 97: val_loss did not improve from 0.11296
Epoch 98: val_loss did not improve from 0.11296
Epoch 99: val_loss did not improve from 0.11296
Epoch 100: val_loss did not improve from 0.11296
Epoch 101: val_loss did not improve from 0.11296
Epoch 102: val_loss did not improve from 0.11296
Epoch 103: val_loss did not improve from 0.11296
Epoch 104: val_loss did not improve from 0.11296
Epoch 105: val_loss did not improve from 0.11296
Epoch 106: val_loss did not improve from 0.11296
Epoch 107: val_loss did not improve from 0.11296
Epoch 108: val_loss did not improve from 0.11296
Epoch 109: val_loss did not improve from 0.11296
Epoch 110: val_loss did not improve from 0.11296
Epoch 111: val_loss did not improve from 0.11296
Epoch 112: val_loss did not improve from 0.11296
Epoch 113: val_loss did not improve from 0.11296
Epoch 114: val_loss did not improve from 0.11296
Epoch 115: val_loss did not improve from 0.11296
Epoch 116: val_loss did not improve from 0.11296
Epoch 117: val_loss did not improve from 0.11296
Epoch 118: val_loss did not improve from 0.11296
Epoch 119: val_loss did not improve from 0.11296
Epoch 120: val_loss did not improve from 0.11296
Epoch 121: val_loss did not improve from 0.11296
Epoch 122: val_loss did not improve from 0.11296
Epoch 123: val_loss did not improve from 0.11296
Epoch 124: val_loss did not improve from 0.11296
Epoch 125: val_loss did not improve from 0.11296
Epoch 126: val_loss did not improve from 0.11296
Epoch 127: val_loss did not improve from 0.11296
Epoch 128: val_loss did not improve from 0.11296
Epoch 129: val_loss did not improve from 0.11296
Epoch 130: val_loss did not improve from 0.11296
Epoch 131: val_loss did not improve from 0.11296
Epoch 132: val_loss did not improve from 0.11296
Epoch 133: val_loss did not improve from 0.11296
Epoch 134: val_loss did not improve from 0.11296
Epoch 135: val_loss did not improve from 0.11296
Epoch 136: val_loss did not improve from 0.11296
Epoch 137: val_loss did not improve from 0.11296
Epoch 138: val_loss did not improve from 0.11296
Epoch 139: val_loss did not improve from 0.11296
Epoch 140: val_loss did not improve from 0.11296
Epoch 141: val_loss did not improve from 0.11296
Epoch 142: val_loss did not improve from 0.11296
Epoch 143: val_loss did not improve from 0.11296
Epoch 144: val_loss did not improve from 0.11296
Epoch 145: val_loss did not improve from 0.11296
Epoch 146: val_loss did not improve from 0.11296
Epoch 147: val_loss did not improve from 0.11296
Epoch 148: val_loss did not improve from 0.11296
Epoch 149: val_loss did not improve from 0.11296
Epoch 150: val_loss did not improve from 0.11296
# plot the training history: validation loss vs training loss per epoch
# (use explicit keywords -- plt.figure(10, (10, 5)) passed num/figsize positionally)
plt.figure(num=10, figsize=(10, 5))
plt.plot(hist.history['val_loss'], label='val_loss')
plt.plot(hist.history['loss'], label='loss')
plt.legend()
<matplotlib.legend.Legend at 0x242039e5b40>
We can see overfitting starts at around 50 epochs. This is important for when we want to train the model on the whole dataset. Now let's check performance.
# load back the best (lowest val_loss) weights saved by the checkpoint callback
model.load_weights('./data/DL/model_weights_best.h5')
# predict on the eval split -- outputs are still on the scaler's transformed
# (presumably log1p + standardized) scale; rescaled below
y_preds = model.predict(x_eval, batch_size=10)
30/30 [==============================] - 0s 2ms/step
# invert the StandardScaler: the scaler was fitted on the full column layout
# with the target as the last column, so pad each prediction vector into a
# zero matrix of that width, inverse-transform, and keep only the last column
def _undo_scaling(vec):
    padded = np.zeros((x_eval.shape[0], x_eval.shape[1] + 1))
    padded[:, -1] = np.squeeze(vec)
    return scaler.inverse_transform(padded)[:, -1]

preds_rescaled = _undo_scaling(y_preds)
y_eval_rescaled = _undo_scaling(y_eval)
# evaluate with the competition metric
rmse(y_eval_rescaled, preds_rescaled)
0.13422389567364781
# final training on all training data: rebuild a fresh, untrained copy of the model
dl = build()
Model: "model_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_2 (InputLayer) [(None, 333)] 0
dense_3 (Dense) (None, 1000) 334000
dropout_2 (Dropout) (None, 1000) 0
dense_4 (Dense) (None, 100) 100100
dropout_3 (Dropout) (None, 100) 0
dense_5 (Dense) (None, 1) 101
=================================================================
Total params: 434,201
Trainable params: 434,201
Non-trainable params: 0
_________________________________________________________________
None
# retrain on train + eval combined; 50 epochs is roughly where overfitting
# started in the monitored run above, so stop there
full_x = np.concatenate((x_train, x_eval), axis=0)
full_y = np.concatenate((y_train.reshape(-1, 1), y_eval.reshape(-1, 1)), axis=0)
dl.fit(x=full_x, y=full_y, batch_size=32, epochs=50, verbose=0)
<keras.callbacks.History at 0x2438c2d2da0>
# normalize test data: append a dummy target column so the column layout
# matches what the scaler was fitted on, transform, then drop that column
dummy_target = np.zeros(X_test.shape[0])
x_test = scaler.transform(np.column_stack((X_test, dummy_target)))[:, :-1]
C:\Dev\Anaconda\envs\elte_ai\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names
# predict on test data -- outputs are on the scaler's transformed scale,
# rescaled back to dollars below
y_preds_test = dl.predict(x_test, batch_size=10)
146/146 [==============================] - 0s 1ms/step
# scale back: pad the predictions into the scaler's full column layout
# (target last), invert the scaler, then undo the log1p transform with expm1
padded_test = np.zeros((x_test.shape[0], x_test.shape[1] + 1))
padded_test[:, -1] = y_preds_test.ravel()
preds_rescaled_test = np.expm1(scaler.inverse_transform(padded_test)[:, -1])
preds_test_dl = preds_rescaled_test
preds_final_dl = pd.DataFrame({"Id": test_IDs, "SalePrice": preds_rescaled_test})
preds_final_dl
| Id | SalePrice | |
|---|---|---|
| 0 | 1461 | 131534.669693 |
| 1 | 1462 | 165172.578058 |
| 2 | 1463 | 179471.864419 |
| 3 | 1464 | 196345.965907 |
| 4 | 1465 | 179432.165070 |
| ... | ... | ... |
| 1454 | 2915 | 94894.202533 |
| 1455 | 2916 | 88548.394947 |
| 1456 | 2917 | 168072.988735 |
| 1457 | 2918 | 118933.102863 |
| 1458 | 2919 | 210714.324294 |
1459 rows × 2 columns
# save the DL-only freestyle predictions for submission
preds_final_dl.to_csv("./data/freestyle_preds.csv", index=None)
# create ensemble preds: simple (unweighted) average of the gradient boosting,
# random forest and deep learning test predictions
stacked_preds = np.column_stack((preds_test_gb, preds_test_rf, preds_test_dl))
preds_test_ensemble = stacked_preds.mean(axis=1)
preds_final_ensemble = pd.DataFrame({"Id": test_IDs, "SalePrice": preds_test_ensemble})
preds_final_ensemble
| Id | SalePrice | |
|---|---|---|
| 0 | 1461 | 131212.174619 |
| 1 | 1462 | 165875.090142 |
| 2 | 1463 | 176887.468719 |
| 3 | 1464 | 192306.382069 |
| 4 | 1465 | 189384.589165 |
| ... | ... | ... |
| 1454 | 2915 | 89079.854580 |
| 1455 | 2916 | 86090.332983 |
| 1456 | 2917 | 169209.946253 |
| 1457 | 2918 | 112761.835770 |
| 1458 | 2919 | 219134.774701 |
1459 rows × 2 columns
# save the ensemble predictions (not DL-only) -> this got 0.13578 in Kaggle
preds_final_ensemble.to_csv("./data/freestyle_ensemble_preds.csv", index=None)